*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: Raw Data
* 			 		 
*	- Output: Interim Datasets & 'data_counterfactual_analysis.dta'
*			 
* Topic: Generate dataset for main counterfactual analysis (primary outcomes)
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

clear all
clear matrix
cap log close
set more off

*-----*
* 0.1 *
*-----*

* Run the dofiles which defines all the globals for the folder's and sub-folders
* paths to both data and analysis.


if c(username) == "chrdec" {                          
  do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"        // 0_globals.do path

}

*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
macro list
cd "${data}"
set matsize 11000

cap log off
log using "${run}${log}log_2.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*-------------------*
* Table of content: *
*-------------------*
* Sections: 
* 1. Import Tables in Sata
* 2. Merge Tables I 
* 3. Generate Variables
* 4. Generate Rankings of Industry Sponsors by Firm Size
* 5. Codify Placebo Control
* 6. MeSH Classification for Condition Fixed Effects Market Size
* 7. Merge Tables II & Restrict Sample
* 8. Generate Dummies for Fixed Effects
* 9. Summary Statistics


*------------------------------------*
* Section 1 -- Import tables in Sata *
*------------------------------------*

import delimited "${data}${raw}studies.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) clear
save "${data}${inter}studies.dta", replace

import delimited "${data}${raw}sponsors.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) clear
keep if lead_or_collaborator=="lead"
unique nct_id
save "${data}${inter}lead_sponsors.dta", replace

import delimited "${data}${raw}outcomes.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) clear
rename id outcome_id
save "${data}${inter}outcomes.dta", replace

import delimited "${data}${raw}outcome_analyses.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) asdouble clear
save "${data}${inter}outcome_analyses.dta", replace

import delimited "${data}${raw}interventions.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) clear
save "${data}${inter}interventions.dta", replace
keep if intervention_type=="Drug"
keep nct_id
duplicates drop
save "${data}${inter}intervention_drug_list.dta", replace

import delimited "${data}${raw}browse_conditions.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) clear
save "${data}${inter}browse_conditions.dta", replace

import delimited "${data}${raw}intervention_other_names.csv", varn(1) delimiter(comma) bindquote(strict) maxquotedrows(unlimited) clear
rename name other_name
save "${data}${inter}intervention_other_names.dta", replace

*-----------------------------*
* Section 2 -- Merge Tables I *
*-----------------------------*

use "${data}${inter}outcomes.dta", clear
merge 1:m outcome_id using "${data}${inter}outcome_analyses.dta"
drop _merge
save "${data}${inter}results_merged.dta", replace

use "${data}${inter}studies.dta", clear
keep nct_id start_date completion_date study_type overall_status phase enrollment //brief_title official_title
merge 1:1 nct_id using "${data}${inter}lead_sponsors.dta", keepusing(agency_class name)
rename name agency
drop _merge

merge 1:m nct_id using "${data}${inter}results_merged.dta", keepusing(id non_inferiority_type p_value p_value_modifier p_value_description outcome_type)
drop _merge
rename id results_outcome_analysis_id
rename p_value p
gen p_provided=1
replace p_provided=0 if p<0 | p>1 | p==.
drop if p_provided==0
replace p_value_modifier="<" if p==0 & p_value_modifier!=">"
replace p=0.0001 if p==0 & p_value_modifier!=">"
recast str2045 p_value_description

merge m:1 nct_id using "${data}${inter}intervention_drug_list.dta"
keep if _merge==3
drop _merge


*---------------------------------*
* Section 3 -- Generate Variables *
*---------------------------------*

replace p_value_modifier="" if p_value_modifier=="," | p_value_modifier=="=" | p_value_modifier=="NS" | p_value_modifier=="=" | p_value_modifier=="â¤"
replace p_value_modifier=subinstr(p_value_modifier,"=","",10) 
replace p_value_modifier=subinstr(p_value_modifier,"p","",10) 
replace p_value_modifier=subinstr(p_value_modifier,",","",10) 
gen z_modifier=0
replace z_modifier=1 if p_value_modifier=="<"
replace z_modifier=-1 if p_value_modifier==">"

gen z=-invnormal(p/2)

gen start_year=substr(start_date,1,4)
destring start_year, replace
gen start_month=substr(start_date,6,2)
destring start_month, replace

gen completion_year=substr(completion_date,1,4)
destring completion_year, replace
gen completion_month=substr(completion_date,6,2)
destring completion_month, replace

gen Ph2=0
gen Ph3=0
replace Ph2=1 if phase=="Phase 2"
replace Ph3=1 if phase=="Phase 3"

gen INDUSTRY=0
replace INDUSTRY=1 if agency_class=="Industry"

gen NI=0
replace NI=1 if non_inferiority_type=="Non-Inferiority or Equivalence"

gen PRIMARY=0
replace PRIMARY=1 if outcome_type=="Primary"

keep if study_type=="Interventional"


*------------------------------------------------------------------*
* Section 4 -- Generate Rankings of Industry Sponsors by Firm Size *
*------------------------------------------------------------------*


** ranking by revenue 2018
gen rev_rank=.
replace rev_rank=1 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rev_rank=2 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rev_rank=3 if regexm(agency,"Hoffmann-La Roche")
replace rev_rank=4 if regexm(agency,"Pfizer")
replace rev_rank=5 if regexm(agency,"Novartis")
replace rev_rank=6 if regexm(agency,"Bayer")
replace rev_rank=7 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK")
replace rev_rank=8 if regexm(agency,"Merck Sharp & Dohme Corp.")
replace rev_rank=9 if regexm(agency,"Sanofi")
replace rev_rank=10 if regexm(agency,"Eli Lilly")
replace rev_rank=11 if regexm(agency,"Amgen")
replace rev_rank=12 if regexm(agency,"Bristol-Myers Squibb")
replace rev_rank=13 if regexm(agency,"Gilead")
replace rev_rank=14 if regexm(agency,"AstraZeneca")
replace rev_rank=15 if regexm(agency,"Danaher") //no trials found??
replace rev_rank=16 if regexm(agency,"Boehringer Ingelheim")
replace rev_rank=17 if regexm(agency,"Takeda")
replace rev_rank=18 if regexm(agency,"Teva")
replace rev_rank=19 if regexm(agency,"Novo Nordisk")
replace rev_rank=20 if regexm(agency,"Merck KGaA")
*source: https://en.wikipedia.org/wiki/List_of_largest_biomedical_companies_by_revenue (collected from official financial statements)

gen top10rev=0
replace top10rev=1 if rev_rank<=10


** ranking by prescription sales 2018
gen rxsales_rank=.
replace rxsales_rank=1 if regexm(agency,"Pfizer")
replace rxsales_rank=2 if regexm(agency,"Hoffmann-La Roche")
replace rxsales_rank=3 if regexm(agency,"Novartis")
replace rxsales_rank=4 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rxsales_rank=5 if regexm(agency,"Merck Sharp & Dohme Corp.")
replace rxsales_rank=6 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rxsales_rank=7 if regexm(agency,"Sanofi")
replace rxsales_rank=8 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK")
replace rxsales_rank=9 if regexm(agency,"Amgen")
replace rxsales_rank=10 if regexm(agency,"Gilead")
replace rxsales_rank=11 if regexm(agency,"Bristol-Myers Squibb")
replace rxsales_rank=12 if regexm(agency,"AstraZeneca")
replace rxsales_rank=13 if regexm(agency,"Eli Lilly")
replace rxsales_rank=14 if regexm(agency,"Bayer")
replace rxsales_rank=15 if regexm(agency,"Novo Nordisk")
replace rxsales_rank=16 if regexm(agency,"Takeda")
replace rxsales_rank=17 if regexm(agency,"Celgene")
replace rxsales_rank=18 if regexm(agency,"Shire")
replace rxsales_rank=19 if regexm(agency,"Boehringer Ingelheim")
replace rxsales_rank=20 if regexm(agency,"Allergan")
*source: Pharmaceutical Executive June 2019 (based on data from EvaluatePharma)


** ranking by RD spending 2018
gen rdspend_rank=.
replace rdspend_rank=1 if regexm(agency,"Hoffmann-La Roche")
replace rdspend_rank=2 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rdspend_rank=3 if regexm(agency,"Novartis")
replace rdspend_rank=4 if regexm(agency,"Pfizer")
replace rdspend_rank=5 if regexm(agency,"Merck Sharp & Dohme Corp.")
replace rdspend_rank=6 if regexm(agency,"Sanofi")
replace rdspend_rank=7 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rdspend_rank=8 if regexm(agency,"AstraZeneca")
replace rdspend_rank=9 if regexm(agency,"Bristol-Myers Squibb")
replace rdspend_rank=10 if regexm(agency,"Eli Lilly")
replace rdspend_rank=11 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK")
replace rdspend_rank=12 if regexm(agency,"Celgene")
replace rdspend_rank=13 if regexm(agency,"Gilead")
replace rdspend_rank=14 if regexm(agency,"Amgen")
replace rdspend_rank=15 if regexm(agency,"Bayer")
replace rdspend_rank=16 if regexm(agency,"Boehringer Ingelheim")
replace rdspend_rank=17 if regexm(agency,"Takeda")
replace rdspend_rank=18 if regexm(agency,"Biogen")
replace rdspend_rank=19 if regexm(agency,"Novo Nordisk")
replace rdspend_rank=20 if regexm(agency,"Regeneron")
*source: Pharmaceutical Executive June 2019 (based on data from EvaluatePharma)

** ranking by no. relevant trials reported
tab agency if INDUSTRY==1, sort
gen rep_rank=.
replace rep_rank=1 if regexm(agency,"GlaxoSmithKline") | regexm(agency,"GSK") //with subsidies
replace rep_rank=2 if regexm(agency,"Pfizer") //with subsidies
replace rep_rank=3 if regexm(agency,"Merck Sharp & Dohme Corp.") //!= Merck KgaA
replace rep_rank=4 if regexm(agency,"Eli Lilly")
replace rep_rank=5 if regexm(agency,"Boehringer Ingelheim")
replace rep_rank=6 if regexm(agency,"AstraZeneca")
replace rep_rank=7 if regexm(agency,"Hoffmann-La Roche")
replace rep_rank=8 if regexm(agency,"Novartis")
replace rep_rank=9 if regexm(agency,"Takeda")
replace rep_rank=10 if regexm(agency,"Shire")
replace rep_rank=11 if regexm(agency,"Amgen")
replace rep_rank=12 if regexm(agency,"Bayer")
replace rep_rank=13 if regexm(agency,"Sanofi")
replace rep_rank=14 if regexm(agency,"Johnson & Johnson") | regexm(agency,"Janssen Research & Development")
replace rep_rank=15 if regexm(agency,"Gilead")
replace rep_rank=16 if regexm(agency,"Bristol-Myers Squibb")
replace rep_rank=17 if regexm(agency,"Otsuka") //with 2 different names
replace rep_rank=18 if regexm(agency,"AbbVie") | regexm(agency,"Abbott")
replace rep_rank=19 if regexm(agency,"Novo Nordisk")
replace rep_rank=20 if regexm(agency,"Merck KGaA")


save "${data}${inter}data_counterfactual_analysis_interim.dta", replace

*-------------------------------------*
* Section 5 -- Codify Placebo Control *
*-------------------------------------*

use "${data}${inter}interventions.dta",clear
keep nct_id name
replace name=lower(name)
gen placebo=0
replace placebo=1 if regexm(name,"placebo")==1 | regexm(name,"saline")==1 | regexm(name,"water")==1 | regexm(name,"nacl")==1 | regexm(name,"sugar")==1 | regexm(name,"pbo")==1 | regexm(name,"sham")==1 | regexm(name,"vehicle")==1
collapse (max) placebo, by(nct_id)
save "${data}${inter}placebo_list.dta",replace

*--------------------------------------------------------------------------*
* Section 6 -- MeSH Classification for Condition Fixed Effects Market Size *
*--------------------------------------------------------------------------*

use "${data}${raw}mesh_thesaurus.dta",clear
gen letter=substr(mesh_id,1,1)
tab letter,m
keep if letter=="C" | letter=="F"
gen code=substr(mesh_id,1,3)
drop if code=="F01" | code=="F02" | code=="F04" | code=="C22"
replace code="C12/C13" if code=="C12" | code=="C13"
replace code="C08/C09" if code=="C08" | code=="C09"
keep mesh_term code
duplicates drop
bysort mesh_term: gen j=_n
reshape wide code, i(mesh_term) j(j)
save "${data}${inter}mesh_thesaurus_wide.dta",replace


use "${data}${inter}browse_conditions.dta", clear
drop if mesh_term=="Disease"
merge m:1 mesh_term using "${data}${inter}mesh_thesaurus_wide.dta"
keep if _merge==3
drop _merge id mesh_term
gen i=_n
reshape long code, i(i) j(j)
drop if code==""
keep nct_id code
duplicates drop
bysort nct_id: gen N=_N 
drop if N>1 & code=="C23"
drop N

merge m:1 code using "${data}${raw}medicareD_spending_collapsed.dta"
drop _merge
replace total_spending=0 if code=="C07" | code=="C21" | code=="C24"

keep nct_id total_spending code
bysort nct_id: egen double max_spending = max(total_spending)
keep if total_spending==max_spending
drop max_spending


rename code mesh_code
rename total_spending mkt_size

save "${data}${inter}nct_market_size.dta", replace

*------------------------------------------------*
* Section 7 -- Merge Tables II & Restrict Sample *
*------------------------------------------------*

use "${data}${inter}data_counterfactual_analysis_interim.dta", clear
merge m:1 nct_id using "${data}${inter}placebo_list.dta"
drop if _merge==2
drop _merge

merge m:1 p_value_description using "${data}${raw}mht_adjustment_hand_collected.dta"
drop if _merge==2
replace p_adjust=0 if p_adjust==. 
replace cv_adjust=0 if cv_adjust==.
replace interim_adjust=0 if interim_adjust==.
drop _merge

keep if PRIMARY==1
keep if NI==0
replace z_modifier=0 if z_modifier==-1 & z==0

count if agency=="Colgate Palmolive" & (Ph2==1 | Ph3==1)
count if agency=="Colgate Palmolive" & p==.05 & (Ph2==1 | Ph3==1)
*137 out of 150
keep if agency!="Colgate Palmolive"

preserve
bysort nct_id: gen N=_N
duplicates drop nct_id, force

tab N if (Ph2==1 | Ph3==1), sort
sum N if (Ph2==1 | Ph3==1), d
restore

drop if nct_id=="NCT02799472" //drop trial with 211 primary outcomes

count if (Ph2==1 | Ph3==1)

merge m:1 nct_id using "${data}${inter}nct_market_size.dta"
drop if _merge==2
drop _merge

unique nct_id if (Ph2==1 | Ph3==1) & mkt_size==.
replace mesh_code="missing" if mesh_code==""

egen aux=mean(mkt_size)
replace mkt_size=aux if mkt_size==.
drop aux
replace mkt_size=. if mkt_size==0
egen aux=min(mkt_size)
replace mkt_size=aux if mkt_size==.
drop aux


gen D1=0
replace D1=1 if p==0.001 & z_modifier==1
gen D2=0
replace D2=1 if p==0.0001 & z_modifier==1
gen D0=1-D1-D2
gen sqrt_enrollment=sqrt(enrollment)

*-------------------------------------------------*
* Section 8 -- Generate Dummies for Fixed Effects *
*-------------------------------------------------*

tab completion_year
levelsof completion_year if completion_year>2006 & completion_year<2019, local(cys) // omitted category: before 2007
foreach y of local cys {
	gen cy_`y'=0
 	replace cy_`y'=1 if completion_year==`y'
}
replace cy_2018=1 if completion_year>2018 // cy_2018 means 2018 or later

tab mesh_code
gen mesh_aux=subinstr(mesh_code,"/","_",10)
levelsof mesh_aux if mesh_code!="C01" & mesh_code!="C02" & mesh_code!="C03" & mesh_code!="C11" & mesh_code!="C15" & mesh_code!="C24" & mesh_code!="C26" & mesh_code!="missing", local(code) clean // omitted category: missing/others (missing, C01,C02,C03,C11,C15,C24,C26)
foreach c of local code {
	gen mc_`c'=0
 	replace mc_`c'=1 if mesh_aux=="`c'"
}
drop mesh_aux

save "${data}${final}data_counterfactual_analysis.dta", replace

*---------------------------------*
* Section 9 -- Summary Statistics *
*---------------------------------*

use "${data}${final}data_counterfactual_analysis.dta", clear

keep if Ph2==1 | Ph3==1
count if INDUSTRY==0 & Ph2==1
count if INDUSTRY==0 & Ph2==1 & z>=1.959 & z_modifier!=-1
count if INDUSTRY==0 & Ph3==1
count if INDUSTRY==0 & Ph3==1 & z>=1.959 & z_modifier!=-1

count if INDUSTRY==1 & Ph2==1
count if INDUSTRY==1 & Ph2==1 & z>=1.959 & z_modifier!=-1
count if INDUSTRY==1 & Ph3==1
count if INDUSTRY==1 & Ph3==1 & z>=1.959 & z_modifier!=-1

count if D0==0 
count if 1
count if D0==1 & z_modifier!=0
count if z_modifier!=0

bysort nct_id: gen N=_N
duplicates drop nct_id, force
sum N,d


log close
cd "${run}${dofiles}" 
